{
 "cells": [
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:00:44.093853Z",
     "start_time": "2024-08-18T14:00:37.646302Z"
    }
   },
   "source": [
    "from transformers import AutoModelForMaskedLM\n",
    "\n",
    "model_checkpoint = \"mbart-large-cc25\"\n",
    "model = AutoModelForMaskedLM.from_pretrained(model_checkpoint).to('cuda')"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,return_tensors='pt')"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:00:44.914656Z",
     "start_time": "2024-08-18T14:00:44.099417Z"
    }
   },
   "id": "2406f58b3d2c9be4",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu-2/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import re,os\n",
    "\n",
    "# 定义清洗函数\n",
    "def clean_text(text):\n",
    "    text = re.sub(r'[^\\w\\s]', '', text)  # 去除特殊符号\n",
    "    text = ' '.join(text.split())  # 去除多余的空格\n",
    "    return text\n",
    "\n",
    "# 读取并清洗数据\n",
    "def read_and_clean(file_paths):\n",
    "    cleaned_rows = []\n",
    "    names=os.listdir(file_paths)\n",
    "    for name in names:\n",
    "        file_path=os.path.join(file_paths,name)\n",
    "        with open(file_path, 'r', encoding='utf-8') as file:\n",
    "            for idx, line in enumerate(file):\n",
    "                columns = line.strip()\n",
    "                if name.endswith('nl'):\n",
    "                    nl = columns\n",
    "                    nl = clean_text(nl)\n",
    "                    cleaned_rows.append({'text': nl})\n",
    "                elif name.endswith('zh'):\n",
    "                    zh = columns\n",
    "                    zh = clean_text(zh)\n",
    "                    cleaned_rows.append({'text':zh})\n",
    "    \n",
    "    # 转换为 DataFrame\n",
    "    df = pd.DataFrame(cleaned_rows)\n",
    "    return df\n",
    "\n",
    "from datasets import Dataset\n",
    "\n",
    "# 加载并清洗数据\n",
    "train_df = read_and_clean('train/mono')\n",
    "\n",
    "# 转换为 Dataset 对象\n",
    "raw_datasets = Dataset.from_pandas(train_df)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:17.895526Z",
     "start_time": "2024-08-18T14:00:44.916376Z"
    }
   },
   "id": "563846921fd0511",
   "outputs": [],
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "source": [
    "limited_dataset = raw_datasets.shuffle(seed=20).select(range(100000))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:19.423094Z",
     "start_time": "2024-08-18T14:01:17.897554Z"
    }
   },
   "id": "50d5e11c8da9e7",
   "outputs": [],
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "source": [
    "split_datasets=raw_datasets.train_test_split(train_size=0.9,seed=20)\n",
    "split_datasets"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:19.456056Z",
     "start_time": "2024-08-18T14:01:19.426448Z"
    }
   },
   "id": "de6c3091c9d856eb",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text'],\n",
       "        num_rows: 90000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text'],\n",
       "        num_rows: 10000\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "source": [
    "def tokenize_function(examples):\n",
    "    result = tokenizer(examples[\"text\"])\n",
    "    if tokenizer.is_fast:\n",
    "        result[\"word_ids\"] = [result.word_ids(i) for i in range(len(result[\"input_ids\"]))]\n",
    "    return result\n",
    "\n",
    "\n",
    "# Use batched=True to activate fast multithreading!\n",
    "tokenized_datasets = split_datasets.map(\n",
    "    tokenize_function, batched=True, remove_columns=[\"text\"]\n",
    ")\n",
    "tokenized_datasets"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:23.576634Z",
     "start_time": "2024-08-18T14:01:19.457419Z"
    }
   },
   "id": "795ff5fbdc192766",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/90000 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "a92cfa491d7e49a2ac5118bd5124ae9d"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "40f8913e51a342c6bda6fe546091fe31"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'word_ids'],\n",
       "        num_rows: 90000\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'word_ids'],\n",
       "        num_rows: 10000\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "cell_type": "code",
   "source": [
    "chunk_size = 32"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:23.584189Z",
     "start_time": "2024-08-18T14:01:23.579436Z"
    }
   },
   "id": "9f433a717cf6c9a6",
   "outputs": [],
   "execution_count": 7
  },
  {
   "cell_type": "code",
   "source": [
    "def group_texts(examples):\n",
    "    # 拼接所有文本\n",
    "    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}\n",
    "    \n",
    "    # 计算拼接后的文本长度\n",
    "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
    "    \n",
    "    # 计算完整块的数量\n",
    "    num_chunks = (total_length + chunk_size - 1) // chunk_size\n",
    "    \n",
    "    # 确保最后一个块填充到所需长度\n",
    "    padded_length = num_chunks * chunk_size\n",
    "    padding = padded_length - total_length\n",
    "    \n",
    "    # 将拼接的文本填充到所需长度\n",
    "    concatenated_examples = {k: t + [0] * padding for k, t in concatenated_examples.items()}\n",
    "    \n",
    "    # 按块大小拆分文本\n",
    "    result = {\n",
    "        k: [t[i : i + chunk_size] for i in range(0, padded_length, chunk_size)]\n",
    "        for k, t in concatenated_examples.items()\n",
    "    }\n",
    "    \n",
    "    # 创建新的标签列\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    \n",
    "    return result\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:23.597837Z",
     "start_time": "2024-08-18T14:01:23.587290Z"
    }
   },
   "id": "62d4fb44c92dd868",
   "outputs": [],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "source": [
    "lm_datasets = tokenized_datasets.map(\n",
    "    group_texts, \n",
    "    batched=True\n",
    ")\n",
    "lm_datasets"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:38.381938Z",
     "start_time": "2024-08-18T14:01:23.600912Z"
    }
   },
   "id": "565100471035bae1",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/90000 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "6614f3b1a9b642178cbaa54ed7925ce7"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/10000 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "f6ef0902cf504955a15da033e839bf96"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],\n",
       "        num_rows: 48375\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],\n",
       "        num_rows: 5326\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import DataCollatorForLanguageModeling\n",
    "\n",
    "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:38.839663Z",
     "start_time": "2024-08-18T14:01:38.385230Z"
    }
   },
   "id": "216655e28c75f5dc",
   "outputs": [],
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import TrainingArguments\n",
    "\n",
    "batch_size = 4\n",
    "# Show the training loss with every epoch\n",
    "logging_steps = len(split_datasets[\"train\"]) // batch_size\n",
    "model_name = model_checkpoint.split(\"/\")[-1]\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=f\"{model_name}-finetuned\",\n",
    "    overwrite_output_dir=True,\n",
    "    eval_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    weight_decay=0.01,\n",
    "    per_device_train_batch_size=batch_size,\n",
    "    per_device_eval_batch_size=batch_size,\n",
    "    fp16=True,\n",
    "    logging_steps=logging_steps,\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:39.153211Z",
     "start_time": "2024-08-18T14:01:38.851659Z"
    }
   },
   "id": "b7367cbf7201643f",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
     ]
    }
   ],
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "source": [
    "from transformers import Trainer\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=lm_datasets[\"train\"],\n",
    "    eval_dataset=lm_datasets[\"test\"],\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:39.327330Z",
     "start_time": "2024-08-18T14:01:39.157051Z"
    }
   },
   "id": "2697fad204395fda",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.\n"
     ]
    }
   ],
   "execution_count": 13
  },
  {
   "cell_type": "code",
   "source": [
    "trainer.train()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-08-18T14:01:47.150276Z",
     "start_time": "2024-08-18T14:01:39.331154Z"
    }
   },
   "id": "81bb0bca0bf0dec8",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ],
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='18' max='36282' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [   18/36282 00:03 < 2:29:54, 4.03 it/s, Epoch 0.00/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table><p>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/conda-bld/pytorch_1656352465323/work/aten/src/ATen/native/cuda/ScatterGatherKernel.cu:144: operator(): block: [0,0,0], thread: [3,0,0] Assertion `idx_dim >= 0 && idx_dim < index_size && \"index out of bounds\"` failed.\n"
     ]
    },
    {
     "ename": "RuntimeError",
     "evalue": "CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1.",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mRuntimeError\u001B[0m                              Traceback (most recent call last)",
      "Cell \u001B[0;32mIn[14], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mtrainer\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtrain\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/trainer.py:1948\u001B[0m, in \u001B[0;36mTrainer.train\u001B[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001B[0m\n\u001B[1;32m   1946\u001B[0m         hf_hub_utils\u001B[38;5;241m.\u001B[39menable_progress_bars()\n\u001B[1;32m   1947\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m-> 1948\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43minner_training_loop\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1949\u001B[0m \u001B[43m        \u001B[49m\u001B[43margs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1950\u001B[0m \u001B[43m        \u001B[49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mresume_from_checkpoint\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1951\u001B[0m \u001B[43m        \u001B[49m\u001B[43mtrial\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mtrial\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1952\u001B[0m \u001B[43m        \u001B[49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mignore_keys_for_eval\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1953\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/trainer.py:2289\u001B[0m, in \u001B[0;36mTrainer._inner_training_loop\u001B[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001B[0m\n\u001B[1;32m   2286\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontrol \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcallback_handler\u001B[38;5;241m.\u001B[39mon_step_begin(args, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcontrol)\n\u001B[1;32m   2288\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39maccelerator\u001B[38;5;241m.\u001B[39maccumulate(model):\n\u001B[0;32m-> 2289\u001B[0m     tr_loss_step \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mtraining_step\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   2291\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m   2292\u001B[0m     args\u001B[38;5;241m.\u001B[39mlogging_nan_inf_filter\n\u001B[1;32m   2293\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m is_torch_xla_available()\n\u001B[1;32m   2294\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m (torch\u001B[38;5;241m.\u001B[39misnan(tr_loss_step) \u001B[38;5;129;01mor\u001B[39;00m torch\u001B[38;5;241m.\u001B[39misinf(tr_loss_step))\n\u001B[1;32m   2295\u001B[0m ):\n\u001B[1;32m   2296\u001B[0m     \u001B[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001B[39;00m\n\u001B[1;32m   2297\u001B[0m     tr_loss \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m tr_loss \u001B[38;5;241m/\u001B[39m (\u001B[38;5;241m1\u001B[39m \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate\u001B[38;5;241m.\u001B[39mglobal_step \u001B[38;5;241m-\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_globalstep_last_logged)\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/trainer.py:3328\u001B[0m, in \u001B[0;36mTrainer.training_step\u001B[0;34m(self, model, inputs)\u001B[0m\n\u001B[1;32m   3325\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m loss_mb\u001B[38;5;241m.\u001B[39mreduce_mean()\u001B[38;5;241m.\u001B[39mdetach()\u001B[38;5;241m.\u001B[39mto(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mdevice)\n\u001B[1;32m   3327\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcompute_loss_context_manager():\n\u001B[0;32m-> 3328\u001B[0m     loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcompute_loss\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmodel\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   3330\u001B[0m \u001B[38;5;28;01mdel\u001B[39;00m inputs\n\u001B[1;32m   3331\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m   3332\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mtorch_empty_cache_steps \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m   3333\u001B[0m     \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstate\u001B[38;5;241m.\u001B[39mglobal_step \u001B[38;5;241m%\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mtorch_empty_cache_steps \u001B[38;5;241m==\u001B[39m \u001B[38;5;241m0\u001B[39m\n\u001B[1;32m   3334\u001B[0m ):\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/trainer.py:3373\u001B[0m, in \u001B[0;36mTrainer.compute_loss\u001B[0;34m(self, model, inputs, return_outputs)\u001B[0m\n\u001B[1;32m   3371\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m   3372\u001B[0m     labels \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m-> 3373\u001B[0m outputs \u001B[38;5;241m=\u001B[39m \u001B[43mmodel\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43minputs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   3374\u001B[0m \u001B[38;5;66;03m# Save past state if it exists\u001B[39;00m\n\u001B[1;32m   3375\u001B[0m \u001B[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001B[39;00m\n\u001B[1;32m   3376\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39margs\u001B[38;5;241m.\u001B[39mpast_index \u001B[38;5;241m>\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m0\u001B[39m:\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/accelerate/utils/operations.py:819\u001B[0m, in \u001B[0;36mconvert_outputs_to_fp32.<locals>.forward\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m    818\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mforward\u001B[39m(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m--> 819\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mmodel_forward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/accelerate/utils/operations.py:807\u001B[0m, in \u001B[0;36mConvertOutputsToFp32.__call__\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m    806\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__call__\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[0;32m--> 807\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m convert_to_fp32(\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmodel_forward\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m)\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/amp/autocast_mode.py:12\u001B[0m, in \u001B[0;36mautocast_decorator.<locals>.decorate_autocast\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m      9\u001B[0m \u001B[38;5;129m@functools\u001B[39m\u001B[38;5;241m.\u001B[39mwraps(func)\n\u001B[1;32m     10\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mdecorate_autocast\u001B[39m(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs):\n\u001B[1;32m     11\u001B[0m     \u001B[38;5;28;01mwith\u001B[39;00m autocast_instance:\n\u001B[0;32m---> 12\u001B[0m         \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mfunc\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py:1472\u001B[0m, in \u001B[0;36mMBartForConditionalGeneration.forward\u001B[0;34m(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001B[0m\n\u001B[1;32m   1469\u001B[0m     \u001B[38;5;28;01mif\u001B[39;00m decoder_input_ids \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m decoder_inputs_embeds \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m   1470\u001B[0m         decoder_input_ids \u001B[38;5;241m=\u001B[39m shift_tokens_right(labels, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconfig\u001B[38;5;241m.\u001B[39mpad_token_id)\n\u001B[0;32m-> 1472\u001B[0m outputs \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmodel\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1473\u001B[0m \u001B[43m    \u001B[49m\u001B[43minput_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1474\u001B[0m \u001B[43m    \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1475\u001B[0m \u001B[43m    \u001B[49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdecoder_input_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1476\u001B[0m \u001B[43m    \u001B[49m\u001B[43mencoder_outputs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mencoder_outputs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1477\u001B[0m \u001B[43m    \u001B[49m\u001B[43mdecoder_attention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdecoder_attention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1478\u001B[0m \u001B[43m    \u001B[49m\u001B[43mhead_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhead_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1479\u001B[0m \u001B[43m    \u001B[49m\u001B[43mdecoder_head_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdecoder_head_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1480\u001B[0m \u001B[43m    \u001B[49m\u001B[43mcross_attn_head_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcross_attn_head_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1481\u001B[0m \u001B[43m    \u001B[49m\u001B[43mpast_key_values\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mpast_key_values\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1482\u001B[0m \u001B[43m    \u001B[49m\u001B[43minputs_embeds\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs_embeds\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1483\u001B[0m \u001B[43m    \u001B[49m\u001B[43mdecoder_inputs_embeds\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdecoder_inputs_embeds\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1484\u001B[0m \u001B[43m    \u001B[49m\u001B[43muse_cache\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43muse_cache\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1485\u001B[0m \u001B[43m    \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1486\u001B[0m \u001B[43m    \u001B[49m\u001B[43moutput_hidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_hidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1487\u001B[0m \u001B[43m    \u001B[49m\u001B[43mreturn_dict\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mreturn_dict\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1488\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1489\u001B[0m lm_logits \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mlm_head(outputs[\u001B[38;5;241m0\u001B[39m]) \u001B[38;5;241m+\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mfinal_logits_bias\n\u001B[1;32m   1491\u001B[0m masked_lm_loss \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py:1340\u001B[0m, in \u001B[0;36mMBartModel.forward\u001B[0;34m(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, encoder_outputs, past_key_values, inputs_embeds, decoder_inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001B[0m\n\u001B[1;32m   1337\u001B[0m     decoder_input_ids \u001B[38;5;241m=\u001B[39m shift_tokens_right(input_ids, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mconfig\u001B[38;5;241m.\u001B[39mpad_token_id)\n\u001B[1;32m   1339\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m encoder_outputs \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m-> 1340\u001B[0m     encoder_outputs \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mencoder\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m   1341\u001B[0m \u001B[43m        \u001B[49m\u001B[43minput_ids\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minput_ids\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1342\u001B[0m \u001B[43m        \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1343\u001B[0m \u001B[43m        \u001B[49m\u001B[43mhead_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhead_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1344\u001B[0m \u001B[43m        \u001B[49m\u001B[43minputs_embeds\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs_embeds\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1345\u001B[0m \u001B[43m        \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1346\u001B[0m \u001B[43m        \u001B[49m\u001B[43moutput_hidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_hidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1347\u001B[0m \u001B[43m        \u001B[49m\u001B[43mreturn_dict\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mreturn_dict\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m   1348\u001B[0m \u001B[43m    \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1349\u001B[0m \u001B[38;5;66;03m# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True\u001B[39;00m\n\u001B[1;32m   1350\u001B[0m \u001B[38;5;28;01melif\u001B[39;00m return_dict \u001B[38;5;129;01mand\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(encoder_outputs, BaseModelOutput):\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py:971\u001B[0m, in \u001B[0;36mMBartEncoder.forward\u001B[0;34m(self, input_ids, attention_mask, head_mask, inputs_embeds, output_attentions, output_hidden_states, return_dict)\u001B[0m\n\u001B[1;32m    963\u001B[0m         layer_outputs \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_gradient_checkpointing_func(\n\u001B[1;32m    964\u001B[0m             encoder_layer\u001B[38;5;241m.\u001B[39m\u001B[38;5;21m__call__\u001B[39m,\n\u001B[1;32m    965\u001B[0m             hidden_states,\n\u001B[0;32m   (...)\u001B[0m\n\u001B[1;32m    968\u001B[0m             output_attentions,\n\u001B[1;32m    969\u001B[0m         )\n\u001B[1;32m    970\u001B[0m     \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m--> 971\u001B[0m         layer_outputs \u001B[38;5;241m=\u001B[39m \u001B[43mencoder_layer\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m    972\u001B[0m \u001B[43m            \u001B[49m\u001B[43mhidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    973\u001B[0m \u001B[43m            \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    974\u001B[0m \u001B[43m            \u001B[49m\u001B[43mlayer_head_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mhead_mask\u001B[49m\u001B[43m[\u001B[49m\u001B[43midx\u001B[49m\u001B[43m]\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mif\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mhead_mask\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mis\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;28;43;01melse\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    975\u001B[0m \u001B[43m            \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    976\u001B[0m \u001B[43m        \u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    978\u001B[0m     hidden_states \u001B[38;5;241m=\u001B[39m layer_outputs[\u001B[38;5;241m0\u001B[39m]\n\u001B[1;32m    980\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m output_attentions:\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py:453\u001B[0m, in \u001B[0;36mMBartEncoderLayer.forward\u001B[0;34m(self, hidden_states, attention_mask, layer_head_mask, output_attentions)\u001B[0m\n\u001B[1;32m    451\u001B[0m residual \u001B[38;5;241m=\u001B[39m hidden_states\n\u001B[1;32m    452\u001B[0m hidden_states \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mself_attn_layer_norm(hidden_states)\n\u001B[0;32m--> 453\u001B[0m hidden_states, attn_weights, _ \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mself_attn\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m    454\u001B[0m \u001B[43m    \u001B[49m\u001B[43mhidden_states\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhidden_states\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    455\u001B[0m \u001B[43m    \u001B[49m\u001B[43mattention_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mattention_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    456\u001B[0m \u001B[43m    \u001B[49m\u001B[43mlayer_head_mask\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mlayer_head_mask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    457\u001B[0m \u001B[43m    \u001B[49m\u001B[43moutput_attentions\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43moutput_attentions\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m    458\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    459\u001B[0m hidden_states \u001B[38;5;241m=\u001B[39m nn\u001B[38;5;241m.\u001B[39mfunctional\u001B[38;5;241m.\u001B[39mdropout(hidden_states, p\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mdropout, training\u001B[38;5;241m=\u001B[39m\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtraining)\n\u001B[1;32m    460\u001B[0m hidden_states \u001B[38;5;241m=\u001B[39m residual \u001B[38;5;241m+\u001B[39m hidden_states\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/transformers/models/mbart/modeling_mbart.py:275\u001B[0m, in \u001B[0;36mMBartAttention.forward\u001B[0;34m(self, hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions)\u001B[0m\n\u001B[1;32m    271\u001B[0m \u001B[38;5;66;03m# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be\u001B[39;00m\n\u001B[1;32m    272\u001B[0m \u001B[38;5;66;03m# partitioned across GPUs when using tensor-parallelism.\u001B[39;00m\n\u001B[1;32m    273\u001B[0m attn_output \u001B[38;5;241m=\u001B[39m attn_output\u001B[38;5;241m.\u001B[39mreshape(bsz, tgt_len, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39membed_dim)\n\u001B[0;32m--> 275\u001B[0m attn_output \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mout_proj\u001B[49m\u001B[43m(\u001B[49m\u001B[43mattn_output\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m    277\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m attn_output, attn_weights_reshaped, past_key_value\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/module.py:1130\u001B[0m, in \u001B[0;36mModule._call_impl\u001B[0;34m(self, *input, **kwargs)\u001B[0m\n\u001B[1;32m   1126\u001B[0m \u001B[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001B[39;00m\n\u001B[1;32m   1127\u001B[0m \u001B[38;5;66;03m# this function, and just call forward.\u001B[39;00m\n\u001B[1;32m   1128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_backward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_forward_pre_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_backward_hooks\n\u001B[1;32m   1129\u001B[0m         \u001B[38;5;129;01mor\u001B[39;00m _global_forward_hooks \u001B[38;5;129;01mor\u001B[39;00m _global_forward_pre_hooks):\n\u001B[0;32m-> 1130\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mforward_call\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m   1131\u001B[0m \u001B[38;5;66;03m# Do not call functions when jit is used\u001B[39;00m\n\u001B[1;32m   1132\u001B[0m full_backward_hooks, non_full_backward_hooks \u001B[38;5;241m=\u001B[39m [], []\n",
      "File \u001B[0;32m~/anaconda3/envs/slowfast/lib/python3.8/site-packages/torch/nn/modules/linear.py:114\u001B[0m, in \u001B[0;36mLinear.forward\u001B[0;34m(self, input)\u001B[0m\n\u001B[1;32m    113\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mforward\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;28minput\u001B[39m: Tensor) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m Tensor:\n\u001B[0;32m--> 114\u001B[0m     \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mF\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mlinear\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43minput\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mweight\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbias\u001B[49m\u001B[43m)\u001B[49m\n",
      "\u001B[0;31mRuntimeError\u001B[0m: CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1."
     ]
    }
   ],
   "execution_count": 14
  },
  {
   "cell_type": "code",
   "source": [
    "import math\n",
    "\n",
    "eval_results = trainer.evaluate()\n",
    "print(f\">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "87945a62d24707b0",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
